# -*- coding: utf-8 -*-
import math
import sys, jieba
from gensim import corpora, models, similarities
#reload(sys)
#sys.setdefaultencoding('utf8')

# Toy corpus: a few stock-market questions followed by a few sentences about
# Python / gensim text processing.
texts = [
    '什么是股票？',
    '股票是个什么玩意？',
    '新手怎样入门炒股？',
    '现在股市的风险大吗？',
    'python的自然语言处理',
    'gensim的主要功能有把文本转为向量',
    '提供存储文本矩阵的方法',
    '这篇文章主要讨论如何用python来做一些简单的文本处理',
]


def tokenize(text):
    """Split *text* into tokens with jieba's search-mode segmenter.

    Documents and queries must both go through this function so that a query
    is looked up in the dictionary using the same tokens the corpus was
    built from.
    """
    return jieba.lcut_for_search(text)


docs = [tokenize(text) for text in texts]

# Map every token to an integer id, then turn each document into a sparse
# bag-of-words vector of (token_id, count) pairs.
dic = corpora.Dictionary(docs)
corpus = [dic.doc2bow(doc) for doc in docs]
print(corpus)

# Project the bag-of-words vectors into a 2-topic LSI space and build a
# similarity index over the projected documents.
lsi = models.LsiModel(corpus, id2word=dic, num_topics=2)
index = similarities.MatrixSimilarity(lsi[corpus])


def rank_documents(query):
    """Rank every document in ``texts`` against *query*.

    The query is tokenized, converted to a bag-of-words vector with ``dic``,
    projected into the LSI space and compared against ``index``.

    Returns a list of ``(position_in_texts, similarity)`` pairs sorted by
    similarity, best match first. Tokens that are not in ``dic`` are ignored
    by ``doc2bow``.
    """
    query_bow = dic.doc2bow(tokenize(query))
    scores = index[lsi[query_bow]]
    # reverse=True keeps the sort stable, so tied scores stay in document
    # order exactly as with a negated key.
    return sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)


query1 = '股票'
query2 = '文本'

sims_result1 = rank_documents(query1)
sims_result2 = rank_documents(query2)
print(sims_result1)
print(sims_result2)

# Standalone arithmetic check, independent of the similarity demo above.
# The float literal keeps this a true division even on Python 2.
print(math.log10(3.0 / 2))